\documentclass{beamer}
\usetheme{ucb}
\usepackage{url}
\usepackage{graphicx}
\usepackage{algorithmic}
\newcommand{\fst}[2]{\subsection{#1}\frame{\frametitle{#1} #2}}
\DeclareMathOperator*{\argmax}{arg\,max}

\begin{document}

\frame{
	\frametitle{Coding Region Identification \\ via the Gabor Wavelet Transform}
	
	\begin{center}

		\includegraphics[width=.85\textwidth,trim=25mm 4mm 25mm 4mm, clip, keepaspectratio]{gabor.pdf}

		\textbf{Robert Lindsey} \\
		\textbf{Ian McNaney} \\
		\textbf{Federico Unglaub} \\
		December 3, 2009
	\end{center}
}

\section{Motivation}

\subsection{Gene Identification}

\frame{
   \frametitle{Motivation}
   \begin{itemize}
    \item Identify gene coding regions
    \item Can reveal new genes, improve knowledge about existing genes
    \item These genes are potential drug targets
    \item Distribution of coding regions is relevant to understanding gene regulation, splicing, etc.
   \end{itemize}
}

\subsection{Approaches}

\frame{
   \frametitle{Gene Prediction Algorithms}
   \begin{itemize}
	\item Several algorithms already exist
	\item No ``gold standard'' yet
	\item Many of these algorithms require \textit{a priori} information
   \end{itemize}
}

\frame{
   \frametitle{Model-dependent Algorithms}
   \begin{itemize}
    \item Make use of \textit{a priori} information by:
	\begin{itemize}
		\item Computing similarity to a known, annotated set of sequences
		\item Incorporating organism-specific knowledge such as codon usage preference
	\end{itemize}
    \item Weaknesses:
    \begin{itemize}
    	\item Learns the training sequence's mistakes as well as its correct classifications
    	\item Ability to identify novel coding regions or coding regions with characteristics that vary from the annotated training data is limited
    	\item Incorporation of organism-specific knowledge limits generality
    \end{itemize}
   \end{itemize}
}

\section{Problem Description}
\frame{
   \frametitle{Problem Description}
   \begin{itemize}
   	\item Identify exons in a way that:
   	\begin{itemize}
   		\item Requires no prior knowledge of the source organism
   		\item Makes no assumptions about organism-specific features such as codon preference
   		\item Makes no great sacrifice in accuracy to achieve generality
   	\end{itemize}
   \end{itemize}
}

\section{Approach}

\subsection{Periodicity}

\frame{
	\frametitle{Periodicity}
	\begin{itemize}
	  \item First some background
		\item Nucleotide periodicity of period length 3 is an established property of many exons across organisms
		\item If there is an A in a particular location it is more likely that there will be As in other locations within the exon that are offset by multiples of three nucleotides
		\item This holds for each of the nucleotides
	\end{itemize}
}

\frame{
	\frametitle{Periodicity}
	\begin{itemize}
		\item Why is this the case?
		\begin{itemize}
			\item One compelling explanation is that the organism's codon usage preference skews probability of particular nucleotides in particular locations
			\item When codon usage preferences are eliminated in simulated exons the periodicity disappears
			\item The fact that a codon is three nucleotides long causes the period length to be three
		\end{itemize}
		\item Note that not all exons exhibit such periodicity
		% NOTE(review): confirm the spelling of the first author's name against the cited paper.
		\item The following two images are from ``Periodicity of DNA in Exons'' by Steven K. Esken et al., which can be found at \url{http://www.biomedcentral.com/content/pdf/1471-2199-5-12.pdf}
	\end{itemize}
}
\frame{
	\frametitle{H. sapiens Adenine}
	\includegraphics[width=.99\textwidth,keepaspectratio]{adenine.PNG}
}

\frame{
	\frametitle{Multiple species periodicity}
	\includegraphics[width=0.99\textwidth,keepaspectratio]{best_fit.PNG}
}

\frame{
	\frametitle{Our Algorithm}

	\begin{center}
		\includegraphics[width=0.65\textwidth,keepaspectratio]{robPic1.pdf}

		The combined spectral energy tells us how much ``periodicity 3'' is present and where it is in a DNA sequence.
		\textbf{ \textcolor{uogreen}{Our algorithm predicts coding regions based solely on this quantity.} }
	\end{center}

}

\subsection{Time Series}


\frame{
	\frametitle{Numeric Encodings}

	\begin{itemize}
		\item A sequence of characters in $\{A,T,G,C\}$ is hard to analyze!
		\item We want a real-valued function $x(t)$ to analyze, where ``time'' $t$ is sequence position
		\item \textbf{Option 1:} \textcolor{uogreen}{Indicator Sequences}
		\begin{itemize}
			\item Binary-valued function
			\item Simplest possible encoding
		\end{itemize}
		\item \textbf{Option 2}: \textcolor{uogreen}{DNA walks}
		\begin{itemize}
			\item Assign weights to each nucleotide
			\item Set $x(t)$ as the cumulative sum of the weights over positions $1, \ldots, t$
		\end{itemize}
	\end{itemize}
}

\frame{
	\frametitle{Example Indicator Sequences: C. elegans}
	\begin{center}
	\includegraphics[angle=90,width=1\textwidth,keepaspectratio]{indicator.pdf}
	\end{center}
}

\frame{
	\frametitle{Example DNA Walks: C. elegans}
	\begin{center}
	\includegraphics[angle=90,width=1\textwidth,keepaspectratio]{walk.pdf}
	\end{center}
}


\subsection{Spectral Analysis}

\frame{
	\frametitle{Spectral Analysis}
	\begin{itemize}
		\item $x_n(t)$ is a \textcolor{uogreen}{time domain} signal
		\item It's often easier to analyze signals in the \textcolor{uogreen}{frequency domain}
		\begin{itemize}
			\item Fourier transform tells us what frequencies (i.e., periodicities) are present in $x_n(t)$
			\item Since coding and non-coding regions have different frequency content, we want to also know \textbf{when} the component frequencies change
		\end{itemize}
		\item The \textcolor{uogreen}{wavelet transform} solves this problem for us
		\begin{itemize}
			\item Constructs a time-scale representation, where scale is analogous to frequency
			\item Method is similar to a windowed Fourier transform
		\end{itemize}
	\end{itemize}
}

\frame{
	\frametitle{Wavelet Transform}
	\begin{itemize}
		\item Gabor wavelet $\Psi(t)$: a harmonic function within a Gaussian envelope
	\end{itemize}
	\begin{center}
	\includegraphics[width=.55\textwidth,trim=25mm 4mm 25mm 4mm, clip, keepaspectratio]{gabor.pdf}
	\end{center}

	\begin{itemize}
		\item Idea: repeatedly stretch and translate $\Psi$ over signal $x(t)$
		\item $X(a,b) = \frac{1}{\sqrt{a}} \displaystyle\int\limits_{-\infty}^{\infty} x(t)\, \Psi^*\!\left(\frac{t-b}{a}\right) \,dt$
	\end{itemize}
}

\subsection{C. elegans}

\frame{
	\begin{center}
	\includegraphics[height=1\textheight,keepaspectratio]{bioworm1.pdf}
	\end{center}
}


\frame{
	\frametitle{Wavelet Transform (cont.)}
	\begin{itemize}
		\item What do we do with $X(a,b)$? We tried looking at:
		\begin{itemize}
			\item Total energy in particular frequency bands as a function of time
			\item Difference in total energy between period 3 and period 10
			\item Wavelet entropy across all periods
			\item Variance across wavelet coefficients
			\item Average Mutual Information
		\end{itemize}
		\item \ldots{}when $X(a,b)$ was defined using
		\begin{itemize}
			\item Indicator Sequences
			\item DNA Walks
		\end{itemize}
	\end{itemize}
}

\frame{
	\begin{center}
	\includegraphics[width=1\textwidth,keepaspectratio]{bioworm2.pdf}
	\end{center}
}


\section{Results and Conclusions}

\subsection{Burset}

\frame{
   \frametitle{Burset Dataset}
   \begin{itemize}
    \item An annotated dataset of 570 genes from various organisms
    \item Often used as benchmark data for new gene prediction algorithms
    \item Some of the organisms included are:
    \begin{itemize}
    	\item Gallus gallus (chicken)
	\item Homo sapiens (human)
	\item Flavobacteriaceae bacterium
	\item Mus musculus (house mouse)
	\item Ateles geoffroyi (black-handed spider monkey)
	\item Oryctolagus cuniculus (European rabbit)
    \end{itemize}
   \end{itemize}
}

\frame{
   \frametitle{ROC Curves}
   \begin{itemize}
    \item Sensitivity: the percentage of known exons that are correctly classified
    \item Specificity: the percentage of known non-exons that are correctly classified
    \item An ROC (Receiver Operating Characteristic) curve is a plot of sensitivity vs.\ (1~$-$~specificity), or the true positive rate vs.\ the false positive rate
    \item Our classifier assigns a probability between 0 and 1 that each nucleotide is part of an exon
    \item We must select a threshold value between 0 and 1 to separate exon from non-exon nucleotides
  \end{itemize}
}

\frame{
  \frametitle{ROC Curves}
  \begin{itemize}
    \item If that threshold is 0 then the entire sequence is always classified as one long exon, giving perfect sensitivity but terrible specificity
    \item If that threshold is 1 then the entire sequence is classified as non-coding, giving terrible sensitivity but perfect specificity
    \item At any threshold value between 0 and 1 the sensitivity and specificity vary depending on the properties of the classifier, and we get a point on the ROC graph
    \item Plotting points corresponding to every possible threshold value between 0 and 1 gives an ROC curve
   \end{itemize}
}

\frame{
  \frametitle{ROC Curves}
  \begin{itemize}
  	\item The worst possible classifier is equivalent to random classification, which is a straight diagonal line on an ROC graph
  	\item If a classifier is initially worse than random we can simply reverse its classifications to achieve a better than random result
  	\item The area between a classifier's ROC curve and the diagonal line indicating random classification is a measure of the quality of the classifier's output
  \end{itemize}
}

\frame{
	\begin{center}
	\includegraphics[height=1\textheight,keepaspectratio]{bioroc1.pdf}
	\end{center}
}

\frame{
	\begin{center}
	\includegraphics[height=1\textheight,keepaspectratio]{bioroc2.pdf}
	\end{center}
}


\frame{
       \frametitle{Biological Implications}
       \begin{itemize}
               \item Novel sequences could be better analyzed if the exons are identified easily
               \item Directed mutation of these regions followed by studying resulting phenotypic effects would provide clues as to the function of the genes
               \item Amplification of these genes for further study via PCR requires knowledge of what part of the sequence the gene covers
               \item Periodicity could also be studied in the context of structural implications for the DNA helix, regulation of genes, epigenetics, etc.
       \end{itemize}
}

\frame{
       \frametitle{Implications for Gene Identification}
       \begin{itemize}
               \item Our algorithm shows where on the DNA the gene coding regions are, but does NOT specify which strand a gene's open reading frame is on (remember, DNA is a double helix, meaning 2 complementary strands/sequences of nucleotides)
               \item Because of base pairing (A--T, C--G), periodicity on one strand implies periodicity in the complementary strand
               \item There are exons on both strands
               \item There is a 50\% chance that the algorithm will be analyzing the template (complementary) sequence to any specific exon (assuming the exons are distributed evenly between both strands)
               \item Which open reading frames our algorithm will identify depends on which strand we use the algorithm on
       \end{itemize}
}



\subsection{Future Work}
\frame{
   \frametitle{Future Work}
   \begin{itemize}
    \item Possible computational extensions to our project:
    \begin{itemize}
    	\item Use of different wavelet functions (e.g., gamma transform)
	\item Make our algorithm model-dependent
    	\item Expand frequency analysis to include dinucleotides (AC, AG, etc.)
    \end{itemize}
   \end{itemize}
}
\subsection{Summary}
\frame{
   \frametitle{Summary}
   \begin{itemize}
    \item Exons can be identified with significantly better than random precision solely based on nucleotide periodicity
    \item Wavelet analysis is one approach that shows promise
    \item In combination with other approaches accuracy could be improved further
    \item Treatment of biological sequences as time sequences allows the application of signal processing techniques, which have generally not been applied to biological problems
   \end{itemize}
}


\end{document}
